Preparing the review summaries


In [1]:
all_reviews = (spark
    .read
    .json('../../data/raw_data/reviews_Home_and_Kitchen_5.json.gz'))
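
The raw schema can be sanity-checked right after loading; an optional step, not part of the modelling pipeline itself:

all_reviews.printSchema()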

In [3]:
from pyspark.sql.functions import col, udf, trim
from pyspark.sql.types import IntegerType
import re

# Keep only letters and whitespace (raw string avoids an invalid-escape warning in Python 3).
remove_punctuation = udf(lambda line: re.sub(r'[^A-Za-z\s]', '', line))
# Map 1-2 star ratings to the negative class (0) and 5 stars to the positive class (1).
make_binary = udf(lambda rating: 0 if rating in [1, 2] else 1, IntegerType())

reviews = (all_reviews
    .na.fill({ 'reviewerName': 'Unknown' })
    .filter(col('overall').isin([1, 2, 5]))  # drop ambiguous 3-4 star reviews
    .withColumn('label', make_binary(col('overall')))
    .select(col('label').cast('int'), remove_punctuation('summary').alias('summary'))
    .filter(trim(col('summary')) != ''))     # drop summaries left empty after cleaning
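
Before splitting, it is worth seeing how skewed the labels are; the imbalance this reveals is what motivates the oversampling below:

reviews.groupBy('label').count().show()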

Splitting the data and balancing the class skew


In [4]:
train, test = reviews.randomSplit([.8, .2], seed=5436)

In [5]:
def multiply_dataset(dataset, n):
    # Recursively union `n` copies of `dataset` (`n` must be a positive integer).
    return dataset if n <= 1 else dataset.union(multiply_dataset(dataset, n - 1))
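
The same oversampling could be done without recursion via sampling with replacement; a sketch, where ratio is the majority-to-minority count ratio computed in the next cell:

reviews_bad.sample(withReplacement=True, fraction=float(ratio), seed=5436)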

In [6]:
reviews_good = train.filter('label == 1')
reviews_bad = train.filter('label == 0')

# Integer ratio of majority to minority class; `//` keeps the copy count integral in Python 3.
ratio = reviews_good.count() // reviews_bad.count()
reviews_bad_multiplied = multiply_dataset(reviews_bad, ratio)


train_reviews = reviews_bad_multiplied.union(reviews_good)
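
Because the multiplied DataFrame is a deep chain of unions, caching it avoids recomputing that chain during fitting, and a count confirms the classes are now roughly balanced:

train_reviews.cache()
train_reviews.groupBy('label').count().show()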

Benchmark: always predicting the majority class


In [13]:
accuracy = reviews_good.count() / float(train.count())
print('Always predicting 5 stars accuracy: {0}'.format(accuracy))


Always predicting 5 stars accuracy: 0.87139780791

Learning pipeline


In [8]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StopWordsRemover
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression

tokenizer = Tokenizer(inputCol='summary', outputCol='words')

pipeline = Pipeline(stages=[
    tokenizer, 
    StopWordsRemover(inputCol='words', outputCol='filtered_words'),
    HashingTF(inputCol='filtered_words', outputCol='rawFeatures', numFeatures=120000),
    IDF(inputCol='rawFeatures', outputCol='features'),
    LogisticRegression(regParam=.3, elasticNetParam=.01)
])
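
The regParam and elasticNetParam values above are fixed by hand; they could instead be tuned with cross-validation. A minimal sketch (the grid values are illustrative, not tuned):

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr = pipeline.getStages()[-1]
grid = (ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.3, 0.5])
    .addGrid(lr.elasticNetParam, [0.0, 0.01, 0.1])
    .build())
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator(), numFolds=3)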

Evaluating the model on the test set


In [9]:
model = pipeline.fit(train_reviews)

In [10]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

prediction = model.transform(test)
BinaryClassificationEvaluator().evaluate(prediction)


Out[10]:
0.9168045600888572
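
Note that BinaryClassificationEvaluator reports areaUnderROC by default, so 0.917 is not directly comparable to the 0.871 accuracy baseline above. To compare like with like, accuracy can be computed as well; a sketch:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
MulticlassClassificationEvaluator(metricName='accuracy').evaluate(prediction)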

Using the model to extract the most predictive words


In [11]:
from pyspark.sql.functions import explode
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType

# Score each word in isolation: the fitted pipeline expects a `summary` input column,
# so every exploded word is aliased back to that name.
words = (tokenizer
    .transform(reviews)
    .select(explode(col('words')).alias('summary')))

predictors = (model
    .transform(words)
    .select(col('summary').alias('word'), 'probability'))

# `probability` holds a DenseVector: index 0 is P(negative), index 1 is P(positive).
first = udf(lambda x: x[0].item(), FloatType())
second = udf(lambda x: x[1].item(), FloatType())

predictive_words = (predictors
   .select(
       'word', 
       second(col('probability')).alias('positive'), 
       first(col('probability')).alias('negative'))
   .groupBy('word')
   .agg(
       F.max('positive').alias('positive'),
       F.max('negative').alias('negative')))

positive_predictive_words = (predictive_words
    .select(col('word').alias('positive_word'), col('positive').alias('pos_prob'))
    .sort('pos_prob', ascending=False))

negative_predictive_words = (predictive_words
    .select(col('word').alias('negative_word'), col('negative').alias('neg_prob'))
    .sort('neg_prob', ascending=False))
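
On Spark 3.0 or later, the two UDFs above could be replaced by the built-in vector_to_array, which avoids Python serialization overhead; a sketch under that version assumption:

from pyspark.ml.functions import vector_to_array

probs = predictors.withColumn('prob', vector_to_array('probability'))
probs.select('word',
             probs.prob[1].alias('positive'),
             probs.prob[0].alias('negative'))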

In [12]:
import pandas as pd
pd.set_option('display.max_rows', 100)

pd.concat(
    [ positive_predictive_words.limit(100).toPandas(),
      negative_predictive_words.limit(100).toPandas() ],
    axis=1)


Out[12]:
positive_word pos_prob negative_word neg_prob
0 toxic 0.702056 worst 0.693118
1 perfect 0.702056 za 0.681185
2 excellent 0.698975 disappointed 0.681185
3 awesome 0.695059 disappointing 0.677256
4 fantastic 0.690034 disappointment 0.669026
5 dalla 0.689843 terrible 0.667437
6 amazing 0.689843 poor 0.667185
7 wonderful 0.687460 useless 0.660330
8 five 0.683558 bango 0.660330
9 fabulous 0.678689 worthless 0.658721
10 bailey 0.677647 gingergarlic 0.658486
11 handy 0.677647 flimsy 0.658486
12 blox 0.677111 grabber 0.658486
13 regime 0.677111 returned 0.658471
14 love 0.677111 poorly 0.652594
15 best 0.673670 junk 0.652316
16 great 0.670816 jarsgreat 0.652202
17 expectedgot 0.670816 hamiliton 0.652202
18 perfection 0.667853 defective 0.652202
19 silex 0.666016 awful 0.651954
20 loves 0.666016 infuse 0.651954
21 needed 0.664291 coctails 0.651954
22 perfectly 0.663811 meh 0.651746
23 outstanding 0.662990 ok 0.651042
24 wowloving 0.662849 microbopper 0.651042
25 exactly 0.662849 broke 0.648492
26 saves 0.660796 negive 0.647956
27 terrific 0.660496 postal 0.647956
28 classy 0.659689 horrible 0.647956
29 beat 0.659624 cheaply 0.644965
30 solved 0.659568 dangerous 0.644139
31 simple 0.658571 breaks 0.642654
32 finally 0.658427 eh 0.642366
33 ruffled 0.658241 charges 0.642366
34 beautiful 0.658241 okay 0.642030
35 yay 0.655354 mediocre 0.641426
36 easy 0.655322 flawed 0.641148
37 fun 0.654575 weak 0.639523
38 pleasantly 0.652444 managing 0.639406
39 sooner 0.649271 lousy 0.639406
40 versatile 0.649171 broken 0.637405
41 sharpened 0.649171 beware 0.636521
42 gift 0.648284 doesnt 0.636136
43 gorgeous 0.646563 akrobins 0.636136
44 mugthermos 0.646563 madewelding 0.634321
45 wow 0.643426 rusted 0.634321
46 casingborder 0.643426 uncomfortable 0.634118
47 nice 0.643066 nightmare 0.633125
48 ont 0.642420 shoddiest 0.633116
49 solid 0.642420 cracked 0.633116
50 favorite 0.641204 soso 0.633033
51 elegant 0.641035 handheal 0.633033
52 charm 0.640674 garbage 0.632779
53 value 0.640382 overpriced 0.631248
54 yummy 0.639512 fail 0.630755
55 kiss 0.639512 died 0.630731
56 penny 0.638611 short 0.628559
57 cozy 0.638148 frustrating 0.628401
58 superb 0.636925 rusts 0.628361
59 affordable 0.636830 noisy 0.628036
60 heaven 0.636549 lasted 0.627848
61 exceeded 0.635176 stinks 0.627412
62 saver 0.634595 pumped 0.626616
63 comfy 0.634482 theory 0.626616
64 accessorie 0.634482 dissapointed 0.626493
65 tool 0.634073 bad 0.626172
66 sturdy 0.633408 seniors 0.626172
67 nonbasic 0.633408 helpers 0.626172
68 lovely 0.632788 nori 0.626172
69 kitchen 0.632656 cornerssee 0.625675
70 canning 0.631700 awkward 0.625675
71 highly 0.629741 mata 0.625675
72 addition 0.629598 unreliable 0.625524
73 measuring 0.629445 uneven 0.625510
74 storage 0.628716 simpleyet 0.624851
75 essential 0.628671 difficult 0.624851
76 unique 0.626989 leaked 0.623439
77 must 0.626507 worse 0.623415
78 complaints 0.626157 windw 0.623415
79 joining 0.625883 concept 0.623266
80 cake 0.625883 waste 0.622812
81 mom 0.625141 windowmounted 0.622812
82 beats 0.624665 leaks 0.622331
83 aquality 0.624426 leaky 0.622287
84 organized 0.624426 misleading 0.622105
85 husband 0.624266 fragile 0.621708
86 helps 0.624006 disapointed 0.621552
87 pleasure 0.622396 loveable 0.621552
88 loving 0.622124 crap 0.621500
89 convenient 0.621260 warped 0.621298
90 works 0.620345 yuck 0.620694
91 baking 0.620343 nystrip 0.620396
92 fits 0.619780 rusty 0.620396
93 windtunel 0.619480 rip 0.620298
94 stylish 0.619480 inaccurate 0.619848
95 circulonthis 0.618929 hate 0.619539
96 workhorse 0.618929 asteroid 0.619539
97 wife 0.618862 dud 0.618871
98 delight 0.618565 crappy 0.618867
99 emglish 0.618565 skip 0.618234